/*========================== begin_copyright_notice ============================

Copyright (C) 2017 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "Compiler/CISACodeGen/VectorProcess.hpp"
#include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
#include "Compiler/CISACodeGen/EmitVISAPass.hpp"
#include "Compiler/IGCPassSupport.h"
#include "common/IGCIRBuilder.h"
#include "common/LLVMWarningsPush.hpp"
#include "llvmWrapper/Support/Alignment.h"
#include "llvmWrapper/IR/DerivedTypes.h"
#include <llvm/IR/DataLayout.h>
#include <llvm/IR/Instructions.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/InstIterator.h>
#include "common/LLVMWarningsPop.hpp"
#include "Probe/Assertion.h"

using namespace llvm;
using namespace IGC;
using IGCLLVM::FixedVectorType;

//
// Description of VectorProcess Pass
//   The pass is to do data layout of vector explicitly by inserting bitcasts.
//   These bitcasts have special meaning and cannot be deleted. We insert
//   those bitcasts right before emitting vISA code so that the most codegen
//   passes will not need to special-handle those bitcasts.
//
// As we assume that vector type (in llvm ir) is in a "packed form", which means
// that when we group several workitems (each llvm code is a single workitem)
// into a single thread, the elements of a vector in LLVM IR are no longer
// consecutive in their GRF. For example, given <n x T> v,  its vISA
// variable under SIMD8 (group 8 workitems into a single thread) will be
// laid out as follow (For readability, C variables are used and C's struct
// layout is assumed):
//     struct { T c0, c1, c2, c3, c4, c5, c6, c7 } visaVar[n];
//     where c0, c1, ... c7 represent values for simd lane 0 -- 7,
//     respectively. For example, assume the original workitem 0 is at SIMD
//     lane 0, and its vector v for lane 0 will be
//       visaVar[0].c0, visaVar[1].c0, visaVar[2].c0,...... visaVar[n-1].c0,
//     which are no longer consecutive in visaVar.
//
// This layout is not guaranteed to be efficiently generated by gathers/scatters.
// For example,  <16xi8> can be generated by 16 1-byte byte scattered Reads, each
// read reads 1 byte for every lane;  but <16xi8> can be viewed as <4xi32>. And
// a single gather4 can get entire <4xi32>. Thus, to have an efficient message,
// the original vector could be "re-layout" to a different vector type that can
// be mapped to send message more efficiently. But this "re-layout" has a cost,
// that is, we will have to generate mov instructions (maybe a lot), as shown
// below:
//    <16xi8> v
//       struct { i8 c0, c1, ..., c7 } visaVar_v[16];
//       Note: this array of struct is required in IGC (referred to as
//             packed form).
//
//    <4xi32> v_as4xi32
//       struct { i32 c0, c1, ..., c7 } visaVar_v_as4xi32[4]; or
//          struct { i8 c0[4], c1[4], ..., c7[4] } visaVar_v_as4xi32[4];
//          note: each element of the array is actually a struct of array!
//       visaVar_v_as4xi32 = gather4 &v
//
//
//    To convert <4xi32> back to <16xi8> (required as packed-form), the
//    following is needed:
//       for(i=0; i < 4; ++i)
//         for(j=0; j < 4; ++j)
//            visaVar_v[i*4 + j].c0 = visaVar_v_as4xi32[i].c0[j];
//            ......
//            visaVar_v[i*4 + j].c7 = visaVar_v_as4xi32[i].c7[j];
//    and this has 4 * 4 * 8 = 128 mov instructions !
//
//
// In order to generate such mov instructions explicitly, we insert bitcast between
// the original vector and one we want to use for load and store, and this bitcast
// basically emits movs similar to the conversion code as shown above.  We call
// this bitcast as re-data-layout. The following is the code generated for this
// explicit bitcast (done by emitVectorBitCast):
//     before:   %v = load <16xi8>* p
//
//     after:    %np = bitcast p to <4 x i32>*
//               %nv = load <4 x i32>* np
//               %v  = bitcast nv to <16 x i8>       <<--- re-data-layout bitcast
//
// Since this could potentially generate a lot of movs (may be optimized away),
// bitcasts are inserted only if it is needed.
//
// ** Note, we guarantee that the size of a vector is either 1, 2 bytes,
// ** or multiple of DW at this point. This is guaranteed by VectorPreProcess
// ** (as <3 x i8> cannot be mapped to a single send message, it has to be
// ** split. We split <3 x i8> in VectorPreProcess so that we don't have
// ** to worry about splitting vector here).
//
// Given a vector <n x T>, the type of load/store is calculated "conceptually"
// as the following. Note that if sizeof(T) is 4 or 8, we normally do not
// need to do conversion at all (but there are exceptions when the load/store
// is mis-aligned). (Keep in mind that sizeof(T)*n is 1|2|multiple-of-DW.)
//    if (n * sizeof(T) < 4 bytes) {
//      <n x T> ---> S; where S is the scalar type whose size == n * sizeof(T);
//    } else if ( (sizeof(T) != 4 && Using A32 message ) ||
//                (sizeof(T) != 4|8 && Using A64 message) ) {
//
//      <n x T>  -->  <n1 x i64>  : sizeof(T) == 8 && A64 messages; or
//                    <n1 x i32>  : otherwise
//    }
//
// For example,
//  (1)   %1 = load <8 x i16> *p
//        converted into
//          new_p = bitcast p to <4 x i32>*
//          %2    = load <4 x i32> *new_p
//          %1    = bitcast %2 to <8 x i16>
//
//  (2)   %1 = load <4 x i64> *p
//        Using A32, converted into
//          new_p = bitcast p to <8 x i32>*
//          %2 = load <8 x i32> *new_p
//          %1 = bitcast %2 to <4 x i64>
//
//        Using A64, do nothing.
//
namespace {
// Function pass that re-lays-out vector loads/stores (and the GenISA raw /
// predicated load/store intrinsics) by inserting explicit bitcasts, then
// cleans up the redundant bitcasts it may have produced.
class VectorProcess : public FunctionPass {
public:
  // Work list of instructions queued for processing by the pass.
  using InstWorkVector = SmallVector<Instruction *, 32>;

  static char ID; // Pass identification, replacement for typeid

  VectorProcess() : FunctionPass(ID) { initializeVectorProcessPass(*PassRegistry::getPassRegistry()); }

  StringRef getPassName() const override { return "VectorProcess"; }

  bool runOnFunction(Function &F) override;

  // The pass needs the CodeGenContext wrapper and declares that it preserves the CFG.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<CodeGenContextWrapper>();
    AU.setPreservesCFG();
  }

private:
  // Rewrites a single load/store-like instruction; returns true if it changed.
  bool reLayoutLoadStore(Instruction *Inst);
  // Folds bitcast-of-bitcast chains rooted at BC; returns true if anything changed.
  bool optimizeBitCast(BitCastInst *BC);
  // Converts the merge value of a predicated load to the new load type.
  Value *ProcessMergeValue(Instruction *Inst, Value *V, Type *NewTy, Type *NewIntETy, Type *NewIntTy) const;

  // Both pointers are (re)assigned at the start of every runOnFunction call.
  const DataLayout *m_DL = nullptr;
  LLVMContext *m_C = nullptr;
  bool has_8Byte_A64_BS = true; // true if 8-byte A64 Byte scattered is supported
  bool has_QW_BTS_GS = false;   // true if qword BTS Gather/Scatter is supported
  InstWorkVector m_WorkList;
};
} // namespace

// Register pass to igc-opt
//
// The four PASS_* macros below are the arguments handed to the
// IGC_INITIALIZE_PASS_BEGIN/END macros: the command-line flag, a
// human-readable description, and the "CFG only" / "is analysis" booleans.
#define PASS_FLAG "igc-vectorprocess"
#define PASS_DESCRIPTION "Process vector loads/stores for explicit vISA variable layout"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS false
IGC_INITIALIZE_PASS_BEGIN(VectorProcess, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
// CodeGenContextWrapper is the analysis this pass requests in getAnalysisUsage().
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_END(VectorProcess, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)

// Definition of the static pass identifier declared inside the class.
char VectorProcess::ID = 0;

// Factory entry point: allocates a new VectorProcess pass and returns it
// through the generic FunctionPass interface.
FunctionPass *IGC::createVectorProcessPass() {
  FunctionPass *const pass = new VectorProcess();
  return pass;
}

// Rewrites one load/store-like instruction so that the type it reads/writes
// uses an integer element type (i16, i32 or i64) chosen for the message that
// will be used, inserting the casts needed to convert to/from the original
// type.
//
// Inst must be a LoadInst, a StoreInst, or one of the GenISA intrinsics
// ldrawvector_indexed / ldraw_indexed / storerawvector_indexed /
// storeraw_indexed / PredicatedLoad / PredicatedStore; anything else trips an
// assertion.
//
// Returns false when the access is left untouched (it is already in a suitable
// form). Returns true when Inst has been replaced; in that case Inst has been
// erased from its parent and must not be used by the caller afterwards.
bool VectorProcess::reLayoutLoadStore(Instruction *Inst) {
  LoadInst *const LI = dyn_cast<LoadInst>(Inst);
  StoreInst *const SI = dyn_cast<StoreInst>(Inst);
  GenIntrinsicInst *const II = dyn_cast<GenIntrinsicInst>(Inst);

  // Step 1: find the address operand (Ptr) and the type of the value being
  // loaded or stored (Ty) for whichever kind of instruction this is.
  Value *Ptr = nullptr;
  Type *Ty = nullptr;
  if (nullptr != LI) {
    Ptr = LI->getPointerOperand();
    Ty = LI->getType();
  } else if (nullptr != SI) {
    IGC_ASSERT(0 < SI->getNumOperands());
    IGC_ASSERT(nullptr != SI->getOperand(0));

    Ptr = SI->getPointerOperand();
    Ty = SI->getOperand(0)->getType();
  } else {
    IGC_ASSERT(nullptr != II);
    IGC_ASSERT(0 < II->getNumOperands());
    IGC_ASSERT(nullptr != II->getOperand(0));

    // For every handled intrinsic the address is operand 0.
    Ptr = II->getOperand(0);

    if (II->getIntrinsicID() == GenISAIntrinsic::GenISA_ldrawvector_indexed ||
        II->getIntrinsicID() == GenISAIntrinsic::GenISA_ldraw_indexed ||
        II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedLoad) {
      // Loads: the data type is the result type of the call.
      Ty = II->getType();
    } else if (II->getIntrinsicID() == GenISAIntrinsic::GenISA_storerawvector_indexed ||
               II->getIntrinsicID() == GenISAIntrinsic::GenISA_storeraw_indexed) {
      IGC_ASSERT(2 < IGCLLVM::getNumArgOperands(II));
      IGC_ASSERT(nullptr != II->getArgOperand(2));

      // Raw stores: the stored value is argument 2.
      Ty = II->getArgOperand(2)->getType();
    } else if (II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedStore) {
      IGC_ASSERT(1 < IGCLLVM::getNumArgOperands(II));
      IGC_ASSERT(nullptr != II->getArgOperand(1));

      // Predicated store: the stored value is argument 1.
      Ty = II->getArgOperand(1)->getType();
    } else {
      IGC_ASSERT_MESSAGE(0, "Internal Error: unknown intrinsic");
    }
  }

  IGC_ASSERT(nullptr != Ptr);
  IGC_ASSERT(nullptr != Ty);

  IGCLLVM::FixedVectorType *const VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);

  // Treat a scalar as 1-element vector
  uint32_t nelts = VTy ? int_cast<uint32_t>(VTy->getNumElements()) : 1;
  Type *eTy = VTy ? VTy->getElementType() : Ty;
  uint32_t eTyBits = int_cast<unsigned int>(m_DL->getTypeSizeInBits(eTy));

  IGC_ASSERT_MESSAGE((eTyBits == 8 || eTyBits == 16 || eTyBits == 32 || eTyBits == 64),
                     "the Size of Vector element must be 8/16/32/64 bits.");

  uint32_t eTyBytes = (eTyBits >> 3);
  uint32_t TBytes = nelts * eTyBytes; // Total size in bytes

  //
  // Assumption:
  //    1. if the size of vector < 4 bytes, it must be 1 or 2 bytes (never 3);
  //    2. if the size of vector >= 4 bytes, it must be multiple of DW
  // Those 2 assumption are guaranteed by VectorPreProcess.
  //
  // So far, we are using A32 untyped and byte scattered messages,
  // and A64 scattered messages and A64 untyped messages.
  //
  // A32: using DW as the new element type.
  // A64: the new element type will be:
  //        unaligned load/store: DW if no 8-byte A64 byte scattered message
  //                              QW otherwise;
  //        aligned vector of long type:  use QW
  //        others: use DW.
  // For vector whose size is smaller than 4 bytes, they must be converted
  // to a 1-element vector (or scalar) so all elements are read/written with
  // a single message.
  //
  // Step 2: decide the new element type (new_eTy) and element count
  // (new_nelts), or return early when no conversion is required.
  Type *new_eTy;
  uint32_t new_nelts;
  PointerType *PtrTy = cast<PointerType>(Ptr->getType());

  if (TBytes == 1) {
    IGC_ASSERT_MESSAGE(nelts == 1, "Internal Error: something wrong");
    return false;
  } else if (TBytes == 2 || TBytes == 4) {
    if (nelts == 1) {
      // No conversion needed.
      return false;
    }
    // A 2- or 4-byte vector is accessed as a single i16 / i32 scalar.
    new_nelts = 1;
    new_eTy = (TBytes == 2) ? Type::getInt16Ty(*m_C) : Type::getInt32Ty(*m_C);
  } else {
    // This handles all the other cases
    CodeGenContext *cgCtx = nullptr;
    cgCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
    bool useA64 = IGC::isA64Ptr(PtrTy, cgCtx);
    bool useBSS = IGC::DecodeBufferType(PtrTy->getAddressSpace()) == IGC::BINDLESS;
    // Alignment of the access: taken from the load/store itself, or from the
    // constant alignment argument of the predicated intrinsics (argument 1 for
    // PredicatedLoad, argument 2 for PredicatedStore). The raw indexed
    // intrinsics fall back to an alignment of 1 here.
    alignment_t align;
    if (LI) {
      align = IGCLLVM::getAlignmentValue(LI);
    } else if (SI) {
      align = IGCLLVM::getAlignmentValue(SI);
    } else if (II && II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedLoad) {
      align = cast<ConstantInt>(II->getArgOperand(1))->getZExtValue();
    } else if (II && II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedStore) {
      align = cast<ConstantInt>(II->getArgOperand(2))->getZExtValue();
    } else {
      align = 1;
    }

    // Choose between QW (8-byte) and DW (4-byte) elements.
    bool useQW = false;
    if (useA64) {
      useQW = (TBytes % 8 == 0) && ((has_8Byte_A64_BS && align < 4) || (eTyBytes == 8U && align >= 8U));
    } else if (useBSS) {
      useQW = has_QW_BTS_GS && nelts == 1 && (eTyBytes == 8U && align >= 8U);
    }

    if (EmitPass::shouldGenerateLSCQuery(*cgCtx, Inst) == Tristate::True) {
      // With LSC, want to use QW if element size is 8 bytes.
      useQW = (eTyBytes == 8);
    }

    const uint32_t new_eTyBytes = useQW ? 8 : 4;
    if (eTyBytes == new_eTyBytes && !eTy->isAggregateType()) {
      // The original vector is already a good one. Skip.
      return false;
    }
    new_eTy = useQW ? Type::getInt64Ty(*m_C) : Type::getInt32Ty(*m_C);
    IGC_ASSERT(new_eTyBytes);
    IGC_ASSERT_MESSAGE((TBytes % new_eTyBytes) == 0, "Wrong new vector size");
    new_nelts = TBytes / new_eTyBytes;
  }

  // Step 3: build the new value type and a pointer of the matching type.
  // All new instructions are inserted right before Inst.
  IGCIRBuilder<> Builder(Inst);
  Type *newVTy;
  if (new_nelts == 1) {
    newVTy = new_eTy;
  } else {
    newVTy = FixedVectorType::get(new_eTy, new_nelts);
  }
  Type *newPtrTy = PointerType::get(newVTy, PtrTy->getPointerAddressSpace());
  Value *newPtr;
  if (IntToPtrInst *i2p = dyn_cast<IntToPtrInst>(Ptr)) {
    // The address already comes from an inttoptr: re-create the inttoptr with
    // the new pointer type from the same integer instead of stacking a bitcast.
    newPtr = Builder.CreateIntToPtr(i2p->getOperand(0), newPtrTy, "IntToPtr2");
  } else {
    newPtr = Builder.CreateBitCast(Ptr, newPtrTy, "vptrcast");
  }

  // These types are needed when we are dealing with pointers
  // and using ptrtoint and inttoptr.
  Type *int_eTy = Type::getIntNTy(*m_C, eTyBits);
  Type *new_intTy = VTy ? FixedVectorType::get(int_eTy, nelts) : int_eTy;

  // Step 4: emit the replacement access for the specific instruction kind.
  if (LI || (II && II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedLoad)) {
    // ---- plain load / predicated load ----
    Instruction *oldLoad = LI ? cast<Instruction>(LI) : cast<Instruction>(II);
    Instruction *load;
    if (LI) {
      load = Builder.CreateAlignedLoad(newVTy, newPtr, IGCLLVM::getCorrectAlign(IGCLLVM::getAlignmentValue(LI)),
                                       LI->isVolatile(), "vCastload");
    } else {
      Type *types[] = {newVTy, newPtrTy, newVTy};

      Function *F = GenISAIntrinsic::getDeclaration(II->getParent()->getParent()->getParent(),
                                                    GenISAIntrinsic::GenISA_PredicatedLoad, types);
      // Operands 1 and 2 are forwarded unchanged; operand 3 (the merge value)
      // has to be converted to the new load type.
      load = Builder.CreateCall4(F, newPtr, II->getOperand(1), II->getOperand(2),
                                 ProcessMergeValue(Inst, II->getOperand(3), newVTy, int_eTy, new_intTy));
    }
    load->copyMetadata(*oldLoad);

    Value *V = load;

    if (eTy->isPointerTy()) {
      // cannot bitcast int to ptr; need to use intToptr.
      // First, cast the loaded value to a vector type that is same to
      //        the original vector type with ptr element type replaced
      //        with int-element type.
      // second, IntToPtr cast to the original vector type.
      V = Builder.CreateBitCast(V, new_intTy);
      if (VTy) {
        // If we need a vector inttoptr, scalarize it here.
        auto *BC = V;
        V = UndefValue::get(Ty);
        for (unsigned i = 0; i < nelts; i++) {
          auto *EE = Builder.CreateExtractElement(BC, i);
          auto *ITP = Builder.CreateIntToPtr(EE, eTy);
          V = Builder.CreateInsertElement(V, ITP, i);
        }
      } else {
        V = Builder.CreateIntToPtr(V, Ty);
      }
    } else {
      // TODO: if Ty is Aggregate type then this bitCast contradicts to LLVM spec
      V = Builder.CreateBitCast(V, Ty);
    }
    oldLoad->replaceAllUsesWith(V);
    oldLoad->eraseFromParent();
  } else if (SI || (II && II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedStore)) {
    // ---- plain store / predicated store ----
    Instruction *oldStore = SI ? cast<Instruction>(SI) : cast<Instruction>(II);
    Value *StoreVal = SI ? SI->getValueOperand() : II->getArgOperand(1);
    Value *V;
    if (eTy->isPointerTy()) {

      // Similar to the load. First, PtrtoInt cast to a new vector,
      // and then bitcast to the stored type.
      // (This local int_eTy shadows the outer one; both are the same type.)
      Type *int_eTy = Type::getIntNTy(*m_C, eTyBits);
      if (VTy) {
        // If we need a vector ptrtoint, scalarize it here.
        V = UndefValue::get(FixedVectorType::get(int_eTy, nelts));
        for (unsigned i = 0; i < nelts; i++) {
          auto *EE = Builder.CreateExtractElement(StoreVal, i);
          auto *ITP = Builder.CreatePtrToInt(EE, int_eTy);
          V = Builder.CreateInsertElement(V, ITP, i);
        }
      } else if (isa<IntToPtrInst>(StoreVal) && cast<IntToPtrInst>(StoreVal)->getOperand(0)->getType() == int_eTy) {
        // Detect case when creating PtrToInt and BitCast instructions
        // is not needed. This is when store value is created from
        // a vector with the same type as the target vector type.
        //
        // e.g. example from a Vulkan shader with variable pointers:
        // Before:
        //     %7 = bitcast <2 x i32> %assembled.vect7 to i64
        //     %Temp-26.i.VP = inttoptr i64 %7 to i32 addrspace(1179648)*
        //     store i32 addrspace(1179648)* %Temp-26.i.VP, i32 addrspace(1179648)** %6, align 8
        // After:
        //     store <2 x i32> %assembled.vect7, <2 x i32>* %vptrcast, align 8

        V = cast<IntToPtrInst>(StoreVal)->getOperand(0);
      } else {
        V = Builder.CreatePtrToInt(StoreVal, int_eTy);
      }

      // If the integer value is itself a bitcast from the target type, store
      // the bitcast's source directly instead of casting back again.
      if (isa<BitCastInst>(V) && (cast<BitCastInst>(V)->getOperand(0)->getType() == newVTy)) {
        V = cast<BitCastInst>(V)->getOperand(0);
      } else {
        V = Builder.CreateBitCast(V, newVTy);
      }
    } else {
      V = Builder.CreateBitCast(StoreVal, newVTy);
    }

    Instruction *store = nullptr;
    if (SI && IGCLLVM::getAlignmentValue(SI) == 0) {
      // No explicit alignment recorded on the original store.
      store = Builder.CreateStore(V, newPtr, SI->isVolatile());
    } else if (SI) {
      store = Builder.CreateAlignedStore(V, newPtr, IGCLLVM::getAlign(*SI), SI->isVolatile());
    } else {
      Type *types[] = {newPtrTy, newVTy};

      Function *F = GenISAIntrinsic::getDeclaration(II->getParent()->getParent()->getParent(),
                                                    GenISAIntrinsic::GenISA_PredicatedStore, types);
      // Operands 2 and 3 of the original call are forwarded unchanged.
      store = Builder.CreateCall4(F, newPtr, V, II->getOperand(2), II->getOperand(3));
    }
    store->copyMetadata(*oldStore);
    oldStore->eraseFromParent();
  } else if (II->getIntrinsicID() == GenISAIntrinsic::GenISA_ldrawvector_indexed ||
             II->getIntrinsicID() == GenISAIntrinsic::GenISA_ldraw_indexed) {
    // ---- raw indexed load: always re-emitted as ldrawvector_indexed ----
    Type *types[] = {newVTy, newPtrTy};

    Function *F = GenISAIntrinsic::getDeclaration(II->getParent()->getParent()->getParent(),
                                                  GenISAIntrinsic::GenISA_ldrawvector_indexed, types);
    Value *V = Builder.CreateCall4(F, newPtr, II->getOperand(1), II->getOperand(2), II->getOperand(3));

    if (eTy->isPointerTy()) {
      // Pointer elements: go through the same-width integer type, then inttoptr.
      Type *intETy = Type::getIntNTy(*m_C, eTyBits);
      Type *newIntTy = VTy ? IGCLLVM::FixedVectorType::get(intETy, nelts) : intETy;
      V = Builder.CreateBitCast(V, newIntTy);
      V = Builder.CreateIntToPtr(V, Ty);
    } else {
      V = Builder.CreateBitCast(V, Ty);
    }

    II->replaceAllUsesWith(V);
    II->eraseFromParent();
  } else {
    // ---- raw indexed store: always re-emitted as storerawvector_indexed ----
    IGC_ASSERT(II->getIntrinsicID() == GenISAIntrinsic::GenISA_storerawvector_indexed ||
               II->getIntrinsicID() == GenISAIntrinsic::GenISA_storeraw_indexed);
    Type *types[] = {newPtrTy, newVTy};

    Function *F = GenISAIntrinsic::getDeclaration(II->getParent()->getParent()->getParent(),
                                                  GenISAIntrinsic::GenISA_storerawvector_indexed, types);

    Value *V;
    if (eTy->isPointerTy()) {
      // Pointer elements: ptrtoint to the same-width integer type, then bitcast.
      Type *intETy = Type::getIntNTy(*m_C, eTyBits);
      Type *newIntTy = VTy ? IGCLLVM::FixedVectorType::get(intETy, nelts) : intETy;
      V = Builder.CreatePtrToInt(II->getOperand(2), newIntTy);
      V = Builder.CreateBitCast(V, newVTy);
    } else {
      V = Builder.CreateBitCast(II->getOperand(2), newVTy);
    }

    // Operands 1, 3 and 4 of the original call are forwarded unchanged.
    Builder.CreateCall5(F, newPtr, II->getOperand(1), V, II->getOperand(3), II->getOperand(4));
    II->eraseFromParent();
  }
  return true;
}

// Simplifies bitcasts rooted at BC.
//
//  * If BC casts a value to its own type, all uses of BC are redirected to the
//    source value.
//  * Otherwise (non-pointer casts only), every user of BC that is itself a
//    bitcast is rewritten to cast directly from BC's source, so that
//    "bitcast(bitcast(Src))" collapses into a single cast (or into Src itself
//    when the final type equals Src's type).
//
// BC and the bypassed user bitcasts are NOT erased here; they are merely left
// without uses where possible so that the caller can delete them afterwards.
// Newly created bitcast instructions are appended to m_WorkList so they get
// processed as well.
//
// Returns true if any use was rewritten.
bool VectorProcess::optimizeBitCast(BitCastInst *BC) {
  Value *Src = BC->getOperand(0);
  Type *SrcTy = Src->getType();
  Type *Ty = BC->getType();

  if (Ty == SrcTy) {
    BC->replaceAllUsesWith(Src);
    return true;
  }

  // Only handle non-pointer bitcast
  if (isa<PointerType>(Ty) || isa<PointerType>(SrcTy)) {
    return false;
  }

  bool change = false;
  // Rewriting the uses of a *user* of BC does not touch BC's own use list, so
  // iterating over BC's users while doing so is safe.
  for (Value::user_iterator UI = BC->user_begin(), UE = BC->user_end(); UI != UE; ++UI) {
    BitCastInst *Inst = dyn_cast<BitCastInst>(*UI);
    if (!Inst) {
      continue;
    }

    Type *Ty1 = Inst->getType();
    if (SrcTy == Ty1) {
      // Cast there and back again: use the original value directly.
      Inst->replaceAllUsesWith(Src);
    } else {
      IRBuilder<> Builder(Inst);
      // The builder may constant-fold the cast (e.g. when Src is a constant),
      // in which case the result is not a BitCastInst. Keep it as a plain
      // Value and only queue it when it really is a new bitcast instruction;
      // blindly casting the result to BitCastInst* would put a non-instruction
      // on the work list, which is later erased via eraseFromParent().
      Value *NewCast = Builder.CreateBitCast(Src, Ty1);
      Inst->replaceAllUsesWith(NewCast);

      if (BitCastInst *nBC = dyn_cast<BitCastInst>(NewCast)) {
        // Add nBC so it will be processed again.
        m_WorkList.push_back(nBC);
      }
    }
    change = true;
  }
  return change;
}

// Pass entry point.
//
// Phase 1 collects every load, store and handled GenISA load/store intrinsic in
// F and re-lays each of them out via reLayoutLoadStore(). Phase 2 runs only if
// phase 1 changed something: it folds chains of bitcasts and then deletes the
// bitcasts that ended up without uses.
//
// Returns true if any load/store was rewritten.
bool VectorProcess::runOnFunction(Function &F) {
  CodeGenContext *const cgCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();

  // Refresh the per-function state used by the helpers.
  m_DL = &F.getParent()->getDataLayout();
  m_C = &F.getContext();
  has_8Byte_A64_BS = cgCtx->platform.has8ByteA64ByteScatteredMessage();
  has_QW_BTS_GS = cgCtx->platform.hasQWGatherScatterBTSMessage();

  // Intrinsics that reLayoutLoadStore() knows how to handle.
  auto isHandledIntrinsic = [](GenIntrinsicInst *GII) {
    switch (GII->getIntrinsicID()) {
    case GenISAIntrinsic::GenISA_ldrawvector_indexed:
    case GenISAIntrinsic::GenISA_ldraw_indexed:
    case GenISAIntrinsic::GenISA_storerawvector_indexed:
    case GenISAIntrinsic::GenISA_storeraw_indexed:
    case GenISAIntrinsic::GenISA_PredicatedLoad:
    case GenISAIntrinsic::GenISA_PredicatedStore:
      return true;
    default:
      return false;
    }
  };

  //  Adjust load/store layout by inserting bitcast.
  //  Those bitcasts should not be optimized away.
  // Candidates are gathered first because rewriting erases instructions.
  for (Instruction &I : instructions(F)) {
    if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
      m_WorkList.push_back(&I);
      continue;
    }
    GenIntrinsicInst *const GII = dyn_cast<GenIntrinsicInst>(&I);
    if (GII && isHandledIntrinsic(GII)) {
      m_WorkList.push_back(&I);
    }
  }

  bool changed = false;
  for (Instruction *Candidate : m_WorkList) {
    changed = reLayoutLoadStore(Candidate) || changed;
  }
  m_WorkList.clear();

  if (!changed) {
    // DumpLLVMIR(cgCtx, "vectorprocess");
    return false;
  }

  // To remove unnecessary bitcast
  for (Instruction &I : instructions(F)) {
    if (isa<BitCastInst>(&I)) {
      m_WorkList.push_back(&I);
    }
  }

  // optimizeBitCast() may append new bitcasts to m_WorkList, so the size has
  // to be re-read on every iteration (hence the index-based loop).
  bool needCleanup = false;
  for (unsigned Idx = 0; Idx < m_WorkList.size(); ++Idx) {
    BitCastInst *const Cast = dyn_cast<BitCastInst>(m_WorkList[Idx]);
    if (Cast && optimizeBitCast(Cast)) {
      needCleanup = true;
    }
  }

  // Given  b2 = bitcast A,  T2
  //        b1 = bitcast b2, T1
  // we say b1's level is 1, b2's level is 2.
  //
  // This pass, in theory, can have two-level dead bitcasts.
  // Therefore, we expect "while" will take three iterations at most. And
  // WorkList is the set of bitcasts,  which isn't expected to be big.
  while (needCleanup) {
    needCleanup = false;
    for (Instruction *&Slot : m_WorkList) {
      if (Slot != nullptr && Slot->use_empty()) {
        Slot->eraseFromParent();
        Slot = nullptr; // mark as already deleted
        needCleanup = true;
      }
    }
  }

  m_WorkList.clear();

  // DumpLLVMIR(cgCtx, "vectorprocess");
  return true;
}

// Converts the merge value V of a predicated load into a value of type NewTy
// (the type of the re-laid-out load).
//
//   Inst        - insertion point; any new instructions are placed before it.
//   V           - the original merge value.
//   NewTy       - the type the result must have.
//   NewIntEType - integer type used for a single element when V holds pointers.
//   NewIntTy    - integer (scalar or vector) type with V's shape, used as the
//                 intermediate type when V holds pointers.
//
// Zero-initializer, poison and undef inputs are mapped straight to the matching
// constant of NewTy; everything else is converted with casts.
Value *VectorProcess::ProcessMergeValue(Instruction *Inst, Value *V, Type *NewTy, Type *NewIntEType,
                                        Type *NewIntTy) const {
  if (isa<ConstantAggregateZero>(V)) {
    if (auto *NewVecTy = dyn_cast<IGCLLVM::FixedVectorType>(NewTy)) {
      return ConstantAggregateZero::get(NewVecTy);
    }
    return Constant::getNullValue(NewTy);
  }

  // Poison must be tested before undef.
  if (isa<PoisonValue>(V)) {
    return PoisonValue::get(NewTy);
  }
  if (isa<UndefValue>(V)) {
    return UndefValue::get(NewTy);
  }

  IRBuilder<> Builder(Inst);

  Type *const OrigTy = V->getType();
  IGCLLVM::FixedVectorType *const OrigVecTy = dyn_cast<IGCLLVM::FixedVectorType>(OrigTy);
  Type *const ElemTy = OrigVecTy ? OrigVecTy->getElementType() : OrigTy;

  // Non-pointer data can be bitcast directly.
  if (!ElemTy->isPointerTy()) {
    return Builder.CreateBitCast(V, NewTy);
  }

  // cannot bitcast ptr to int; First, PtrToInt cast
  // then bitcast int (scalar or vector) to the new type.
  if (!OrigVecTy) {
    Value *const AsInt = Builder.CreatePtrToInt(V, NewIntTy);
    return Builder.CreateBitCast(AsInt, NewTy);
  }

  // need a vector ptrtoint, scalarize:
  const uint32_t NumElts = int_cast<uint32_t>(OrigVecTy->getNumElements());
  Value *IntVec = UndefValue::get(NewIntTy);
  for (unsigned Lane = 0; Lane < NumElts; ++Lane) {
    Value *const Elt = Builder.CreateExtractElement(V, Lane);
    Value *const EltAsInt = Builder.CreatePtrToInt(Elt, NewIntEType);
    IntVec = Builder.CreateInsertElement(IntVec, EltAsInt, Lane);
  }
  return Builder.CreateBitCast(IntVec, NewTy);
}

//
// getInfo maps a vector to the right messages. It assumes that a vector
// can be mapped to more than one message, and those messages may be
// different as long as they return exactly the same "packed form"
// of the vector.
//
// getInfo() initializes the array of struct (insts), which specifies
// the number of send instructions (or gathers/scatters visa instructions)
// needed to read/write this vector into vISA variable. The clients will
// access this array of struct directly after getInfo() call.
//
// VectorProcess() will change each vector load and store into a new vector
// load and store that can map exactly to these messages. getInfo() has
// the following agreement with VectorProcess():
//   1) If sizeof(Ty) >= 4 bytes, sizeof(Ty) must be multiple of 4 bytes.
//      And furthermore, the element type of 'Ty' if 'Ty' is a vector type,
//      or 'Ty' itself if 'Ty' is a scalar type, must be either 4 bytes (DW) or
//      8 bytes (QW).
//   2) If sizeof(Ty) < 4 bytes, sizeof(Ty) must be either 1 byte or
//      2 bytes. The sizeof(Ty) cannot be 3 bytes!
// (Note that VectorMessage and VectorProcess must be in sync with regard
//  to this agreement.)
//
// Parameters:
//   Ty                   - scalar or vector type being read/written.
//   Align                - alignment of the access in bytes.
//   useA32               - true to pick the A32 message kinds, false for A64.
//   forceByteScatteredRW - forces the byte-scattered message (A32 only).
// Results are stored in the members 'insts' (one entry per send) and
// 'numInsts' (number of valid entries).
void VectorMessage::getInfo(Type *Ty, uint64_t Align, bool useA32, bool forceByteScatteredRW) {
  VectorType *VTy = dyn_cast<VectorType>(Ty);
  Type *eTy = VTy ? cast<VectorType>(VTy)->getElementType() : Ty;
  unsigned eltSize = Shader->GetScalarTypeSizeInRegister(eTy);
  unsigned nElts = VTy ? (unsigned)cast<IGCLLVM::FixedVectorType>(VTy)->getNumElements() : 1;
  // total bytes
  const unsigned TBytes = nElts * eltSize;

  // Per-channel Max Bytes (MB) that can be read/written by a single send inst
  unsigned MB;
  SIMDMode SM = Shader->m_SIMDSize;
  bool has_8B_A64_BS = Shader->m_Platform->has8ByteA64ByteScatteredMessage();
  bool has_8DW_A64_SM = Shader->m_Platform->has8DWA64ScatteredMessage();

  //
  // Set up default message and the data type of the message
  //
  MESSAGE_KIND defaultKind;
  VISA_Type defaultDataType;
  if (Align < 4 || TBytes < 4 || forceByteScatteredRW) {
    // Under-aligned or sub-DW access (or explicitly requested): byte granularity.
    if (forceByteScatteredRW) {
      IGC_ASSERT(useA32);
    }
    defaultKind = useA32 ? MESSAGE_A32_BYTE_SCATTERED_RW : MESSAGE_A64_SCATTERED_RW;
    MB = useA32 ? A32_BYTE_SCATTERED_MAX_BYTES
                : ((has_8B_A64_BS && eltSize == 8) ? A64_BYTE_SCATTERED_MAX_BYTES_8B : A64_BYTE_SCATTERED_MAX_BYTES);
    defaultDataType = ISA_TYPE_UB;

    // To make sure that vector and message match.
    IGC_ASSERT_MESSAGE((MB == eltSize || (MB > eltSize && nElts == 1)), "Internal Error: mismatch layout for vector");
  } else {
    defaultKind = useA32 ? MESSAGE_A32_UNTYPED_SURFACE_RW : MESSAGE_A64_SCATTERED_RW;

    MB = useA32 ? A32_UNTYPED_MAX_BYTES
                : ((has_8DW_A64_SM && SM == SIMDMode::SIMD8) ? A64_SCATTERED_MAX_BYTES_8DW_SIMD8
                                                             : A64_SCATTERED_MAX_BYTES_4DW);

    bool allowQWMessage = !useA32 && eltSize == 8 && Align >= 8U;

    defaultDataType = (eltSize == 8) ? ISA_TYPE_UQ : ISA_TYPE_UD;
    // To make sure that send returns the correct layout for vector.
    IGC_ASSERT_MESSAGE((eltSize == 4 /* common */ || allowQWMessage /* A64, QW */),
                       "Internal Error: mismatch layout for vector");
  }

  MESSAGE_KIND kind = defaultKind;
  VISA_Type dataType = defaultDataType;
  unsigned bytes = TBytes; // bytes not yet covered by an entry in insts[]
  size_t i = 0;

  // Emit as many full-size (MB bytes per channel) messages as fit.
  for (; bytes >= MB; ++i, bytes -= MB) {
    IGC_ASSERT(i < (sizeof(insts) / sizeof(*insts)));
    insts[i].startByte = (uint16_t)(TBytes - bytes);
    insts[i].kind = kind;
    insts[i].blkType = dataType;
    insts[i].blkInBytes = (uint16_t)CEncoder::GetCISADataTypeSize(dataType);
    IGC_ASSERT(insts[i].blkInBytes);
    insts[i].numBlks = MB / insts[i].blkInBytes;
  }

  // Process the remaining elements if any. It could have at most
  // two separate sends. For example, assuming the remaining bytes
  // are for <7 x i32> and it is for A64 SIMD8 with align >=4; thus
  // we will need two sends: one for the first <4 x i32> and the
  // second for  the remaining <3 x i32>.
  if (MB == A64_SCATTERED_MAX_BYTES_8DW_SIMD8) {          // MB == 32 bytes
    unsigned MB2 = A64_SCATTERED_MAX_BYTES_8DW_SIMD8 / 2; // 16 bytes
    if (bytes > MB2) {
      IGC_ASSERT(i < (sizeof(insts) / sizeof(*insts)));
      insts[i].startByte = (uint16_t)(TBytes - bytes);
      insts[i].kind = kind;
      // Every entry must carry its block type; without this assignment the
      // half-size send would be left with whatever value insts[i].blkType
      // happened to hold before.
      insts[i].blkType = dataType;
      insts[i].blkInBytes = (uint16_t)CEncoder::GetCISADataTypeSize(dataType);
      IGC_ASSERT(insts[i].blkInBytes);
      insts[i].numBlks = MB2 / insts[i].blkInBytes;
      ++i;
      bytes -= MB2;
    }
  }

  // Whatever is still left goes into one final, smaller message.
  if (bytes > 0) {
    if (Align >= 4) {
      // A 3-DW tail on A64 is sent with the untyped surface message instead.
      if (!useA32 && eltSize == 4 && bytes == 12) {
        kind = MESSAGE_A64_UNTYPED_SURFACE_RW;
      }
    }

    IGC_ASSERT(i < (sizeof(insts) / sizeof(*insts)));
    insts[i].startByte = (uint16_t)(TBytes - bytes);
    insts[i].kind = kind;
    insts[i].blkType = dataType;
    insts[i].blkInBytes = (uint16_t)CEncoder::GetCISADataTypeSize(dataType);
    IGC_ASSERT(insts[i].blkInBytes);
    insts[i].numBlks = (uint16_t)bytes / insts[i].blkInBytes;
    ++i;
  }

  numInsts = i;
  IGC_ASSERT_MESSAGE(numInsts <= VECMESSAGEINFO_MAX_LEN,
                     "Vector's size is too big, increase MAX_VECMESSAGEINFO_LEN to fix it!");
  IGC_ASSERT_MESSAGE(numInsts <= (sizeof(insts) / sizeof(*insts)),
                     "Vector's size is too big, increase MAX_VECMESSAGEINFO_LEN to fix it!");
}

// LSC counterpart of getInfo(): splits an access of type Ty into LSC messages
// and records them in 'insts' / 'numInsts'.
//
//   Ty        - scalar or vector type being read/written.
//   Align     - alignment of the access in bytes.
//   ctx       - code-gen context (must not be null); supplies the GRF size and
//               is used to derive the vISA data type.
//   useA32    - true to use the A32 LSC message kind, false for A64.
//   transpose - when true, the access is split into blocks whose element count
//               is taken from a fixed table (64, 32, 16, 8, 4, 3, 2, 1),
//               largest first; otherwise it is split into chunks of at most MB
//               bytes per channel.
void VectorMessage::getLSCInfo(llvm::Type *Ty, uint64_t Align, CodeGenContext *ctx, bool useA32, bool transpose) {
  IGC_ASSERT(nullptr != ctx);
  IGC_ASSERT(nullptr != Shader);

  IGCLLVM::FixedVectorType *VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
  Type *eTy = VTy ? VTy->getContainedType(0) : Ty;
  unsigned eltSize = Shader->GetScalarTypeSizeInRegister(eTy);
  unsigned nElts = VTy ? (unsigned)VTy->getNumElements() : 1;
  // total bytes
  const unsigned TBytes = nElts * eltSize;
  // Block counts tried by the transposed path, in ascending order.
  const char TRANS_VEC_SIZE[8] = {1, 2, 3, 4, 8, 16, 32, 64};
  MESSAGE_KIND kind = useA32 ? MESSAGE_A32_LSC_RW : MESSAGE_A64_LSC_RW;

  VISA_Type dataType = GetType(Ty, ctx);
  uint16_t blkInBytes = (uint16_t)CEncoder::GetCISADataTypeSize(dataType);

  // Per-channel Max Bytes (MB) that can be read/written by a single send inst
  const unsigned int numLanesForSIMDSize = numLanes(Shader->m_SIMDSize);
  IGC_ASSERT(numLanesForSIMDSize);
  unsigned int MB = (8 * ctx->platform.getGRFSize()) / numLanesForSIMDSize;
  if (Align < 4 || (eltSize == 8 && Align < 8)) {
    // Under-aligned access: one element per message.
    MB = eltSize;
  }

  size_t i = 0;
  if (transpose) {
    unsigned bytes = TBytes;
    // Walk the table from the largest block count down to the smallest.
    for (int j = 0; j < 8; j++) {
      const unsigned int denominator = blkInBytes * TRANS_VEC_SIZE[7 - j];
      IGC_ASSERT(denominator);

      if (bytes % denominator == 0) {
        // The remainder is an exact multiple of this block size: emit one
        // message of this size and stop.
        IGC_ASSERT(i < (sizeof(insts) / sizeof(*insts)));
        insts[i].startByte = (uint16_t)(TBytes - bytes);
        insts[i].kind = kind;
        insts[i].blkType = dataType;
        insts[i].blkInBytes = blkInBytes;
        insts[i].numBlks = TRANS_VEC_SIZE[7 - j];
        bytes -= insts[i].numBlks * blkInBytes;
        i++;
        break;
      } else //
      {
        if (bytes / denominator != 0) {
          // At least one block of this size fits: emit it and keep going with
          // the smaller sizes for what is left.
          IGC_ASSERT(i < (sizeof(insts) / sizeof(*insts)));
          insts[i].startByte = (uint16_t)(TBytes - bytes);
          insts[i].kind = kind;
          insts[i].blkType = dataType;
          insts[i].blkInBytes = blkInBytes;
          insts[i].numBlks = TRANS_VEC_SIZE[7 - j];
          bytes -= insts[i].numBlks * blkInBytes;
          i++;
        } // else j++;
      }
    }
    IGC_ASSERT(bytes == 0);
  } else {
    // A zero chunk size would make the loop below spin forever.
    IGC_ASSERT(MB);
    IGC_ASSERT(blkInBytes);

    unsigned bytes = TBytes;
    // Full-size chunks first. Each write to insts[] is bounds-asserted, just
    // like in the transposed path and in getInfo().
    for (; bytes >= MB; ++i, bytes -= MB) {
      IGC_ASSERT(i < (sizeof(insts) / sizeof(*insts)));
      insts[i].startByte = (uint16_t)(TBytes - bytes);
      insts[i].kind = kind;
      insts[i].blkType = dataType;
      insts[i].blkInBytes = blkInBytes;
      IGC_ASSERT(insts[i].blkInBytes);
      insts[i].numBlks = MB / insts[i].blkInBytes;
    }

    // Then one smaller message for the tail, if any.
    if (bytes > 0) {
      IGC_ASSERT(i < (sizeof(insts) / sizeof(*insts)));
      insts[i].startByte = (uint16_t)(TBytes - bytes);
      insts[i].kind = kind;
      insts[i].blkType = dataType;
      insts[i].blkInBytes = blkInBytes;
      IGC_ASSERT(insts[i].blkInBytes);
      insts[i].numBlks = (uint16_t)bytes / insts[i].blkInBytes;
      ++i;
    }
  }

  numInsts = i;
  IGC_ASSERT_MESSAGE(numInsts <= VECMESSAGEINFO_MAX_LEN,
                     "Vector's size is too big, increase MAX_VECMESSAGEINFO_LEN to fix it!");
  IGC_ASSERT_MESSAGE(numInsts <= (sizeof(insts) / sizeof(*insts)),
                     "Vector's size is too big, increase MAX_VECMESSAGEINFO_LEN to fix it!");
}

// Binds the message descriptor to the shader currently being emitted by
// 'emitter' and starts with an empty list of messages.
VectorMessage::VectorMessage(EmitPass *emitter)
    : Shader(emitter->m_currShader) {
  numInsts = 0;
}
